## [1] "December 22, 2025"

Podsumowanie

Biblioteki

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.5.2
library(plotly)
## Warning: package 'plotly' was built under R version 4.5.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
library(shapper)
## Warning: package 'shapper' was built under R version 4.5.2

Wczytywanie danych

# Load the raw data set from "data.csv" (path is relative to the working directory).
data <- read.csv("data.csv")

Podsumowanie zbioru surowego

# Number of rows in the raw data set.
nrow(data)
## [1] 925
# Per-column summary of the raw data set (numeric columns also report NA counts).
summary(data)
##      Ref.           Limits.of.Potential.Window..V.
##  Length:925         Length:925                    
##  Class :character   Class :character              
##  Mode  :character   Mode  :character              
##                                                   
##                                                   
##                                                   
##                                                   
##  Lower.Limit.of.Potential.Window..V. Upper.Limit.of.Potential.Window..V.
##  Min.   :-1.1000                     Min.   :-0.2000                    
##  1st Qu.:-0.3000                     1st Qu.: 0.4000                    
##  Median : 0.0000                     Median : 0.6000                    
##  Mean   :-0.2343                     Mean   : 0.6301                    
##  3rd Qu.: 0.0000                     3rd Qu.: 0.8000                    
##  Max.   : 0.2000                     Max.   : 3.5000                    
##  NA's   :4                           NA's   :4                          
##  Potential.Window..V. Current.Density..A.g. Capacitance..F.g.
##  Min.   :0.4000       Min.   :  0.050       Min.   :   1.4   
##  1st Qu.:0.6000       1st Qu.:  1.000       1st Qu.: 148.6   
##  Median :0.8250       Median :  2.000       Median : 260.2   
##  Mean   :0.8634       Mean   :  5.857       Mean   : 415.5   
##  3rd Qu.:1.0000       3rd Qu.:  5.000       3rd Qu.: 509.9   
##  Max.   :3.5000       Max.   :200.000       Max.   :3344.1   
##  NA's   :5            NA's   :16            NA's   :17       
##  Specific.Surface.Area..m.2.g. Charge.Transfer.Resistance..Rct...ohm.
##  Min.   :   8.896              Min.   : 0.080                        
##  1st Qu.:  57.000              1st Qu.: 0.670                        
##  Median : 159.970              Median : 1.540                        
##  Mean   : 417.438              Mean   : 3.048                        
##  3rd Qu.: 546.000              3rd Qu.: 3.240                        
##  Max.   :2400.000              Max.   :24.200                        
##  NA's   :572                   NA's   :786                           
##  Equivalent.Series.Resistance..Rs...ohm. Electrode.Configuration
##  Min.   : 0.200                          Length:925             
##  1st Qu.: 0.350                          Class :character       
##  Median : 0.580                          Mode  :character       
##  Mean   : 1.602                                                 
##  3rd Qu.: 2.000                                                 
##  Max.   :17.500                                                 
##  NA's   :772                                                    
##  Pore.Size..nm.   Pore.Volume..cm.3.g. Ratio.of.ID.IG      N.at.      
##  Min.   : 0.530   Min.   :0.0200       Min.   :0.120   Min.   : 0.00  
##  1st Qu.: 3.045   1st Qu.:0.1680       1st Qu.:0.940   1st Qu.: 0.00  
##  Median : 4.337   Median :0.2170       Median :1.050   Median : 0.00  
##  Mean   : 8.618   Mean   :0.4857       Mean   :1.121   Mean   : 2.50  
##  3rd Qu.:13.625   3rd Qu.:0.5075       3rd Qu.:1.170   3rd Qu.: 3.20  
##  Max.   :44.131   Max.   :2.3500       Max.   :2.900   Max.   :23.82  
##  NA's   :769      NA's   :729          NA's   :596     NA's   :690    
##      C.at.           O.at.        Electrolyte.Chemical.Formula
##  Min.   : 1.40   Min.   : 1.900   Length:925                  
##  1st Qu.:37.32   1st Qu.: 8.883   Class :character            
##  Median :81.00   Median :13.700   Mode  :character            
##  Mean   :66.52   Mean   :19.176                               
##  3rd Qu.:85.58   3rd Qu.:27.098                               
##  Max.   :98.10   Max.   :54.280                               
##  NA's   :699     NA's   :703                                  
##  Electrolyte.Ionic.Conductivity Electrolyte.Concentration..M.
##  Min.   :1.000                  Min.   :0.100                
##  1st Qu.:6.000                  1st Qu.:1.000                
##  Median :6.000                  Median :1.000                
##  Mean   :5.806                  Mean   :2.576                
##  3rd Qu.:7.000                  3rd Qu.:6.000                
##  Max.   :8.000                  Max.   :6.000                
##  NA's   :99                     NA's   :62                   
##  Cell.Configuration..three.two.electrode.system.
##  Length:925                                     
##  Class :character                               
##  Mode  :character                               
##                                                 
##                                                 
##                                                 
## 

Przetwarzanie wartości brakujących

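Zasady czyszczenia danych:

- kolumny, w których co najmniej połowa wartości jest brakująca, są usuwane;
- kolumna Limits.of.Potential.Window..V. jest usuwana niezależnie od liczby braków;
- wiersze, w których brakuje co najmniej połowy pozostawionych atrybutów, są usuwane;
- braki w kolumnach liczbowych są zastępowane medianą kolumny wyznaczoną na zbiorze surowym;
- w kolumnach tekstowych usuwane są spacje z początku i końca wartości, a puste wartości są zastępowane najczęściej występującą wartością.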

# Column filter: keep a column only when fewer than half of its values are NA.
keep_cols <- colSums(is.na(data)) < nrow(data) / 2
# This column is always dropped. The assignment is guarded because assigning to
# a name that is absent from the mask would append an element and misalign it
# with colnames(data).
if ("Limits.of.Potential.Window..V." %in% names(keep_cols)) {
  keep_cols["Limits.of.Potential.Window..V."] <- FALSE
}

# Medians are computed on the data as loaded, i.e. before any rows are removed.
# Selecting by name with single-bracket list indexing keeps a data frame even
# when only one column is numeric, and vapply() guarantees a named numeric
# vector whatever the number of columns.
numeric_cols <- names(data)[vapply(data, is.numeric, logical(1))]
medians <- vapply(data[numeric_cols], median, numeric(1), na.rm = TRUE)
median_cols <- intersect(colnames(data)[keep_cols], names(medians))

data <- select(data, colnames(data)[keep_cols])
# Row filter: keep a row only when fewer than half of the retained columns are NA.
keep_rows <- rowSums(is.na(data)) < sum(keep_cols) / 2
data <- filter(data, keep_rows)
# Replace the remaining NAs in retained numeric columns with the column median.
for (col in median_cols) {
  missing_rows <- is.na(data[[col]])
  data[[col]][missing_rows] <- medians[[col]]
}

# Remove leading and trailing space characters (" " only) from strings.
#
# x: a character vector of any length (other atomic input is coerced to
#    character by gsub()).
# Returns a character vector of the same length as x. NA elements stay NA,
# and tabs or newlines are left untouched.
str_strip <- function(x) {
  # A single vectorised regex pass replaces the two character-by-character
  # loops, which raised an error on NA input and on vectors longer than one.
  gsub("^ +| +$", "", x)
}

# Clean every non-numeric column: trim surrounding spaces, then replace empty
# strings with the most frequent non-empty value of that column.
string_cols <- colnames(data)[!vapply(data, is.numeric, logical(1))]
for (col in string_cols) {
  # vapply() always yields an unnamed character vector, even for zero rows.
  data[[col]] <- vapply(as.character(data[[col]]), str_strip, character(1),
                        USE.NAMES = FALSE)
  # NA-safe mask of the values that are to be imputed.
  is_blank <- !is.na(data[[col]]) & data[[col]] == ""
  # Empty strings are the missing-value marker, so they are excluded from the
  # counts; otherwise "" could be the most frequent value and the replacement
  # below would change nothing.
  tt <- table(data[[col]][!is_blank])
  if (length(tt) > 0) {
    # which.max() picks the first value among ties.
    max_val <- names(tt)[which.max(tt)]
    data[[col]][is_blank] <- max_val
  }
}

Podsumowanie zbioru po czyszczeniu

# Number of rows left after cleaning.
nrow(data)
## [1] 925
# Per-column summary of the cleaned data set.
summary(data)
##      Ref.           Lower.Limit.of.Potential.Window..V.
##  Length:925         Min.   :-1.1000                    
##  Class :character   1st Qu.:-0.3000                    
##  Mode  :character   Median : 0.0000                    
##                     Mean   :-0.2333                    
##                     3rd Qu.: 0.0000                    
##                     Max.   : 0.2000                    
##  Upper.Limit.of.Potential.Window..V. Potential.Window..V. Current.Density..A.g.
##  Min.   :-0.2000                     Min.   :0.4000       Min.   :  0.05       
##  1st Qu.: 0.4000                     1st Qu.:0.6000       1st Qu.:  1.00       
##  Median : 0.6000                     Median :0.8250       Median :  2.00       
##  Mean   : 0.6299                     Mean   :0.8632       Mean   :  5.79       
##  3rd Qu.: 0.8000                     3rd Qu.:1.0000       3rd Qu.:  5.00       
##  Max.   : 3.5000                     Max.   :3.5000       Max.   :200.00       
##  Capacitance..F.g. Electrode.Configuration Electrolyte.Chemical.Formula
##  Min.   :   1.4    Length:925              Length:925                  
##  1st Qu.: 150.8    Class :character        Class :character            
##  Median : 260.2    Mode  :character        Mode  :character            
##  Mean   : 412.6                                                        
##  3rd Qu.: 493.6                                                        
##  Max.   :3344.1                                                        
##  Electrolyte.Ionic.Conductivity Electrolyte.Concentration..M.
##  Min.   :1.000                  Min.   :0.10                 
##  1st Qu.:6.000                  1st Qu.:1.00                 
##  Median :6.000                  Median :1.00                 
##  Mean   :5.827                  Mean   :2.47                 
##  3rd Qu.:7.000                  3rd Qu.:6.00                 
##  Max.   :8.000                  Max.   :6.00                 
##  Cell.Configuration..three.two.electrode.system.
##  Length:925                                     
##  Class :character                               
##  Mode  :character                               
##                                                 
##                                                 
## 

Szczegółowa analiza wartości atrybutów

# One plot per attribute: a histogram for numeric columns and a bar chart of
# value counts for text columns. "Ref." and "Electrode.Configuration" are
# not plotted.
for (col in colnames(data)) {
  if (is.numeric(data[[col]])) {
    hist(data[[col]],
         main = col,
         xlab = "Wartość",
         ylab = "Wystąpienia")
    next
  }
  if (col %in% c("Ref.", "Electrode.Configuration")) {
    next
  }
  # Count the occurrences of each distinct value of the text column.
  tt <- table(data[[col]])
  val_cnt <- data.frame(vals = names(tt),
                        counts = as.numeric(tt))
  bar_chart <- ggplot(val_cnt, aes(x = vals, y = counts)) +
    geom_col() +
    labs(x = "Wartość", y = "Wystąpienia", title = col) +
    theme(axis.text.x = element_text(angle = 30, vjust = 0.5))
  # ggplot objects are not auto-printed inside a loop.
  print(bar_chart)
}

Korelacja

# Distinct values of each text attribute, in order of first appearance.
elec_conf_vallist <- unique(data$Electrode.Configuration)
elec_chem_vallist <- unique(data$Electrolyte.Chemical.Formula)
cell_conf_vallist <- unique(data$Cell.Configuration..three.two.electrode.system.)

# Replace each text value by its position in the corresponding value list.
data_w_enums <- mutate(
  data,
  Electrode.Configuration =
    match(Electrode.Configuration, elec_conf_vallist),
  Electrolyte.Chemical.Formula =
    match(Electrolyte.Chemical.Formula, elec_chem_vallist),
  Cell.Configuration..three.two.electrode.system. =
    match(Cell.Configuration..three.two.electrode.system., cell_conf_vallist)
)

# Correlation matrix of all numeric columns (including the integer codes),
# drawn as a heat map.
corr_matrix <- data_w_enums %>%
  select(where(is.numeric)) %>%
  cor()
ggcorrplot(corr_matrix)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
##   Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Interaktywne wykresy

# Potential window (lower to upper limit) drawn as a range for each
# capacitance value, coloured by cell configuration.
p <- ggplot(data) +
  geom_linerange(
    aes(
      x = Capacitance..F.g.,
      ymin = Lower.Limit.of.Potential.Window..V.,
      ymax = Upper.Limit.of.Potential.Window..V.,
      color = Cell.Configuration..three.two.electrode.system.
    ),
    alpha = 0.15
  ) +
  coord_flip() +
  labs(y = "Potential.Window..V.", color = "Cell Configuration")

ggplotly(p)

# The same plot restricted to the ten most frequent electrolyte formulas,
# coloured by formula.
formula_counts <- sort(table(data$Electrolyte.Chemical.Formula),
                       decreasing = TRUE)
n_list <- names(head(formula_counts, 10))
data_filtered <- filter(data, Electrolyte.Chemical.Formula %in% n_list)
p <- ggplot(data_filtered) +
  geom_linerange(
    aes(
      x = Capacitance..F.g.,
      ymin = Lower.Limit.of.Potential.Window..V.,
      ymax = Upper.Limit.of.Potential.Window..V.,
      color = Electrolyte.Chemical.Formula
    ),
    alpha = 0.2
  ) +
  coord_flip() +
  labs(y = "Potential.Window..V.", color = "Chemical Formula")

ggplotly(p)

Predykcja pojemności za pomocą regresji liniowej

# Convert the text columns to integer codes.
# Each code is the position of the value in the list of distinct values, in
# order of first appearance, so the codes carry no ordinal meaning.
elec_conf_vallist <- unique(data$Electrode.Configuration)
elec_chem_vallist <- unique(data$Electrolyte.Chemical.Formula)
cell_conf_vallist <- unique(data$Cell.Configuration..three.two.electrode.system.)

encoded <- mutate(
  data,
  Electrode.Configuration =
    match(Electrode.Configuration, elec_conf_vallist),
  Electrolyte.Chemical.Formula =
    match(Electrolyte.Chemical.Formula, elec_chem_vallist),
  Cell.Configuration..three.two.electrode.system. =
    match(Cell.Configuration..three.two.electrode.system., cell_conf_vallist)
)
# The reference column is not used as a model input.
ml_data <- select(encoded, -Ref.)

# Split the data set: 75% of the rows for training, the rest for testing.
# The seed makes the partition reproducible.
set.seed(9001)
is_training <- createDataPartition(ml_data$Capacitance..F.g.,
                                   p = 0.75,
                                   list = FALSE)

testing_data <- ml_data[-is_training, ]
training_data <- ml_data[is_training, ]

# Training: linear regression of capacitance on all remaining columns,
# resampled with cross-validation. `number` and `repeats` are deliberately
# not set (they were commented out), so the fold count is trainControl()'s
# default.
ctrl <- trainControl(method = "cv")
set.seed(1337)
fit <- train(Capacitance..F.g. ~ .,
             data = training_data,
             method = "lm",
             trControl = ctrl)

# Prediction on the held-out rows, with the target column removed.
test_features <- testing_data %>%
  select(-Capacitance..F.g.)
predictions <- predict(fit, newdata = test_features)

# SHAP attributions for the first test observation, using the test set as
# the data passed to shap().
shapped <- shap(fit, data = testing_data, new_observation = testing_data[1, ])
# Give the underscore-wrapped result columns plain names and drop the row
# that belongs to the target variable itself.
shapped <- shapped %>%
  rename(attribution = "_attribution_", vname = "_vname_", sign = "_sign_") %>%
  filter(vname != "Capacitance..F.g.")
# plot(shapped)
# Horizontal bar chart of the attribution per variable, filled by sign.
# guides(fill = "none") hides the legend; the previous `fill = FALSE` form is
# deprecated since ggplot2 3.3.4 and raised a warning.
ggplot(shapped, aes(x = vname, y = attribution, fill = sign)) +
  geom_bar(stat = "identity") + coord_flip() +
  geom_text(aes(label = round(attribution, 2), hjust = "inward")) +
  guides(fill = "none")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.